{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/train-ms-en.tar.gz\n",
    "# !tar -zxf train-ms-en.tar.gz\n",
    "# !rm train-ms-en.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/train-en-ms.tar.gz\n",
    "# !tar -zxf train-en-ms.tar.gz\n",
    "# !rm train-en-ms.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-en-ms-v1.json\n",
      "augmented-en-ms-v2-part2.json\n",
      "augmented-en-ms-v2-part3.json\n",
      "augmented-en-ms-v2.json\n",
      "augmented-en-ms-v3-part2.json\n",
      "augmented-en-ms-v3.json\n"
     ]
    }
   ],
   "source": [
    "url = 'https://huggingface.co/datasets/mesolitica/noisy-en-ms-augmentation/resolve/main'\n",
    "files = [\n",
    "'augmented-en-ms-v1.json',\n",
    "'augmented-en-ms-v2-part2.json',\n",
    "'augmented-en-ms-v2-part3.json',\n",
    "'augmented-en-ms-v2.json',\n",
    "'augmented-en-ms-v3-part2.json',\n",
    "'augmented-en-ms-v3.json'\n",
    "]\n",
    "\n",
    "for f in files:\n",
    "    print(f)\n",
    "    if os.path.exists(f):\n",
    "        continue\n",
    "    os.system(f'wget -q {url}/{f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-ms-en-1.json\n",
      "augmented-ms-en-2.json\n",
      "augmented-ms-en-3.json\n",
      "augmented-ms-en-v2.json\n",
      "augmented-ms-en-v3.json\n",
      "augmented-ms-en-v4.json\n"
     ]
    }
   ],
   "source": [
    "url = 'https://huggingface.co/datasets/mesolitica/noisy-ms-en-augmentation/resolve/main'\n",
    "files = [\n",
    "    'augmented-ms-en-1.json',\n",
    "    'augmented-ms-en-2.json',\n",
    "    'augmented-ms-en-3.json',\n",
    "    'augmented-ms-en-v2.json',\n",
    "    'augmented-ms-en-v3.json',\n",
    "    'augmented-ms-en-v4.json',\n",
    "]\n",
    "\n",
    "for f in files:\n",
    "    print(f)\n",
    "    if os.path.exists(f):\n",
    "        continue\n",
    "    os.system(f'wget -q {url}/{f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "left.txt  right.txt\r\n"
     ]
    }
   ],
   "source": [
    "!ls train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "left.txt  right.txt\r\n"
     ]
    }
   ],
   "source": [
    "!ls train-en"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "from glob import glob\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████| 3712555/3712555 [00:13<00:00, 267178.98it/s]\n"
     ]
    }
   ],
   "source": [
    "with open('train-noisy.json', 'w') as fopen_jsonl:\n",
    "    with open('train/left.txt') as fopen:\n",
    "        left = fopen.read().split('\\n')\n",
    "\n",
    "    with open('train/right.txt') as fopen:\n",
    "        right = fopen.read().split('\\n')\n",
    "    \n",
    "    for i in tqdm(range(len(left))):\n",
    "        if len(left[i]) and len(right[i]) and len(left[i].split()) < 256 and len(right[i].split()) < 256:\n",
    "            d = {\"translation\": {\"src\": left[i], \"tgt\": right[i], 'prefix': 'terjemah Melayu ke Inggeris: '}}\n",
    "            fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████| 3807616/3807616 [00:19<00:00, 198858.72it/s]\n"
     ]
    }
   ],
   "source": [
    "with open('train-noisy.json', 'a') as fopen_jsonl:\n",
    "    with open('train-en/left.txt') as fopen:\n",
    "        left = fopen.read().split('\\n')\n",
    "\n",
    "    with open('train-en/right.txt') as fopen:\n",
    "        right = fopen.read().split('\\n')\n",
    "    \n",
    "    for i in tqdm(range(len(left))):\n",
    "        if len(left[i]) and len(right[i]) and len(left[i].split()) < 256 and len(right[i].split()) < 256:\n",
    "            d = {\"translation\": {\"src\": left[i], \"tgt\": right[i], 'prefix': 'terjemah Inggeris ke Melayu: '}}\n",
    "            fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-en-ms-v2-part3.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████| 77100/77100 [00:00<00:00, 209821.34it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-en-ms-v2-part2.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████| 231233/231233 [00:00<00:00, 239004.41it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-en-ms-v1.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████| 984790/984790 [00:04<00:00, 223318.67it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-en-ms-v3.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████| 69976/69976 [00:00<00:00, 245014.66it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-en-ms-v2.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████| 249095/249095 [00:01<00:00, 240526.76it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-en-ms-v3-part2.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████| 138982/138982 [00:00<00:00, 255305.21it/s]\n"
     ]
    }
   ],
   "source": [
    "noisy_en_ms = glob('augmented-en-ms-*.json')\n",
    "\n",
    "with open('train-noisy.json', 'a') as fopen_jsonl:\n",
    "    for f in noisy_en_ms:\n",
    "        print(f)\n",
    "        with open(f) as fopen:\n",
    "            noisy = json.load(fopen)\n",
    "    \n",
    "        for i in tqdm(range(len(noisy['en']))):\n",
    "            if len(noisy['en'][i]) and len(noisy['ms'][i]) and len(noisy['en'][i].split()) < 256 and len(noisy['ms'][i].split()) < 256:\n",
    "                d = {\"translation\": {\"src\": noisy['en'][i], \"tgt\": noisy['ms'][i], 'prefix': 'terjemah Inggeris ke Melayu: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-ms-en-2.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████| 138131/138131 [00:00<00:00, 257593.42it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-ms-en-v3.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████| 419750/419750 [00:01<00:00, 257036.12it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-ms-en-1.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████| 273070/273070 [00:01<00:00, 256876.44it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-ms-en-3.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████| 335872/335872 [00:01<00:00, 262507.14it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-ms-en-v4.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████| 1458303/1458303 [00:05<00:00, 261173.27it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "augmented-ms-en-v2.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████| 84178/84178 [00:00<00:00, 240585.01it/s]\n"
     ]
    }
   ],
   "source": [
    "noisy_ms_en = glob('augmented-ms-en-*.json')\n",
    "\n",
    "with open('train-noisy.json', 'a') as fopen_jsonl:\n",
    "    for f in noisy_ms_en:\n",
    "        print(f)\n",
    "        with open(f) as fopen:\n",
    "            noisy = json.load(fopen)\n",
    "    \n",
    "        for i in tqdm(range(len(noisy['en']))):\n",
    "            if len(noisy['en'][i]) and len(noisy['ms'][i]) and len(noisy['en'][i].split()) < 256 and len(noisy['ms'][i].split()) < 256:\n",
    "                d = {\"translation\": {\"src\": noisy['ms'][i], \"tgt\": noisy['en'][i], 'prefix': 'terjemah Melayu ke Inggeris: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.3772684716131387"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import random\n",
    "random.random()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████| 11786274/11786274 [00:45<00:00, 256227.02it/s]\n"
     ]
    }
   ],
   "source": [
    "with open('train-noisy.json', 'a') as fopen_jsonl:\n",
    "    with open('eng_Latn-zsm_Latn.left') as fopen:\n",
    "        left = fopen.read().split('\\n')\n",
    "    \n",
    "    with open('eng_Latn-zsm_Latn.right') as fopen:\n",
    "        right = fopen.read().split('\\n')\n",
    "        \n",
    "    for i in tqdm(range(len(left))):\n",
    "        if len(left[i]) and len(right[i]) and len(left[i].split()) < 256 and len(right[i].split()) < 256 and random.random() > 0.4:\n",
    "            d = {\"translation\": {\"src\": left[i], \"tgt\": right[i], 'prefix': 'terjemah Inggeris ke Melayu: '}}\n",
    "            fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "            d = {\"translation\": {\"src\": right[i], \"tgt\": left[i], 'prefix': 'terjemah Melayu ke Inggeris: '}}\n",
    "            fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "en-ms-augment-v1.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "736633it [00:16, 45866.92it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "en-ms-augment-v2.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "736633it [00:17, 42939.50it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "en-ms-augment-v3.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "736633it [00:13, 53200.01it/s]\n"
     ]
    }
   ],
   "source": [
    "noisy_files = sorted(glob('en-ms-augment-v*.jsonl'))\n",
    "\n",
    "with open('train-noisy.json', 'a') as fopen_jsonl:\n",
    "    for f in noisy_files:\n",
    "        print(f)\n",
    "        with open(f) as fopen:\n",
    "            for l in tqdm(fopen):\n",
    "                if random.random() > 0.3:\n",
    "                    continue\n",
    "                data = json.loads(l)\n",
    "                for d in data:\n",
    "                    if len(d['augmented']) and len(d['right']) and len(d['augmented'].split()) < 256 and len(d['right'].split()) < 256:\n",
    "                        d = {\"translation\": {\"src\": d['augmented'], \"tgt\": d['right'], 'prefix': 'terjemah Inggeris ke Melayu: '}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"translation\": {\"src\": \"Jump from ke puncak lompatan the gunung dan lakukan some yang tricks!\", \"tgt\": \"Naik ke puncak gunung dan lakukan aksi lompatan yang menegangkan.\", \"prefix\": \"terjemah Inggeris ke Melayu: \"}}\r\n"
     ]
    }
   ],
   "source": [
    "!tail -n 1 train-noisy.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ms-en-augment-v1.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "736633it [00:16, 45241.11it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ms-en-augment-v2.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "736633it [00:16, 43887.04it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ms-en-augment-v3.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "736633it [00:16, 44272.03it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ms-en-augment-v4.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "736633it [00:17, 42913.42it/s]\n"
     ]
    }
   ],
   "source": [
    "noisy_files = sorted(glob('ms-en-augment-v*.jsonl'))\n",
    "\n",
    "with open('train-noisy.json', 'a') as fopen_jsonl:\n",
    "    for f in noisy_files:\n",
    "        print(f)\n",
    "        with open(f) as fopen:\n",
    "            for l in tqdm(fopen):\n",
    "                if random.random() > 0.3:\n",
    "                    continue\n",
    "                data = json.loads(l)\n",
    "                for d in data:\n",
    "                    if len(d['augmented']) and len(d['right']) and len(d['augmented'].split()) < 256 and len(d['right'].split()) < 256:\n",
    "                        d = {\"translation\": {\"src\": d['augmented'], \"tgt\": d['right'], 'prefix': 'terjemah Melayu ke Inggeris: '}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"translation\": {\"src\": \"pendapat yang beliau pilh.\", \"tgt\": \"the view he chooses.\", \"prefix\": \"terjemah Melayu ke Inggeris: \"}}\r\n"
     ]
    }
   ],
   "source": [
    "!tail -n 1 train-noisy.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('test.json', 'w') as fopen_jsonl:\n",
    "    with open('eng_Latn.dev') as fopen:\n",
    "        left = fopen.read().split('\\n')\n",
    "    \n",
    "    with open('zsm_Latn.dev') as fopen:\n",
    "        right = fopen.read().split('\\n')\n",
    "        \n",
    "    for i in tqdm(range(len(left))):\n",
    "        if len(left[i]) and len(right[i]) and len(left[i].split()) < 256 and len(right[i].split()) < 256:\n",
    "            d = {\"translation\": {\"src\": left[i], \"tgt\": right[i], 'prefix': 'terjemah Inggeris ke Melayu: '}}\n",
    "            fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "            d = {\"translation\": {\"src\": right[i], \"tgt\": left[i], 'prefix': 'terjemah Melayu ke Inggeris: '}}\n",
    "            fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
